#!/usr/bin/env python3
"""Download a Project Gutenberg text and tokenize it with NLTK.

Fetches Dostoevsky's "Crime and Punishment" (Gutenberg e-text 2554),
decodes it as UTF-8, and prints some basic statistics about the raw
text and its word tokens.
"""

from urllib.request import urlopen

from nltk import word_tokenize

# Plain-text edition of "Crime and Punishment" on Project Gutenberg.
GUTENBERG_URL = 'http://www.gutenberg.org/files/2554/2554.txt'


def fetch_text(url):
    """Return the body of *url* decoded as UTF-8.

    The response is closed deterministically via the context manager
    (the original left it open).
    """
    with urlopen(url) as response:
        return response.read().decode('utf8')


def main():
    raw = fetch_text(GUTENBERG_URL)
    print(type(raw))
    print(len(raw))

    tokens = word_tokenize(raw)
    print(type(tokens))
    print(len(tokens))
    print(tokens[:100])


# Guard so importing this module does not trigger a network download.
if __name__ == '__main__':
    main()
